library(tidyverse)
library(plotly)

# Read in csv files
ny <- read.csv('new_york_city.csv')
wash <- read.csv('washington.csv')
chi <- read.csv('chicago.csv')
# Displays first rows of each dataframe
head(ny)
##         X          Start.Time            End.Time Trip.Duration
## 1 5688089 2017-06-11 14:55:05 2017-06-11 15:08:21           795
## 2 4096714 2017-05-11 15:30:11 2017-05-11 15:41:43           692
## 3 2173887 2017-03-29 13:26:26 2017-03-29 13:48:31          1325
## 4 3945638 2017-05-08 19:47:18 2017-05-08 19:59:01           703
## 5 6208972 2017-06-21 07:49:16 2017-06-21 07:54:46           329
## 6 1285652 2017-02-22 18:55:24 2017-02-22 19:12:03           998
##             Start.Station            End.Station  User.Type Gender Birth.Year
## 1 Suffolk St & Stanton St W Broadway & Spring St Subscriber   Male       1998
## 2 Lexington Ave & E 63 St        1 Ave & E 78 St Subscriber   Male       1981
## 3       1 Pl & Clinton St   Henry St & Degraw St Subscriber   Male       1987
## 4   Barrow St & Hudson St        W 20 St & 8 Ave Subscriber Female       1986
## 5         1 Ave & E 44 St        E 53 St & 3 Ave Subscriber   Male       1992
## 6     State St & Smith St    Bond St & Fulton St Subscriber   Male       1986
head(wash)
##         X          Start.Time            End.Time Trip.Duration
## 1 1621326 2017-06-21 08:36:34 2017-06-21 08:44:43       489.066
## 2  482740 2017-03-11 10:40:00 2017-03-11 10:46:00       402.549
## 3 1330037 2017-05-30 01:02:59 2017-05-30 01:13:37       637.251
## 4  665458 2017-04-02 07:48:35 2017-04-02 08:19:03      1827.341
## 5 1481135 2017-06-10 08:36:28 2017-06-10 09:02:17      1549.427
## 6 1148202 2017-05-14 07:18:18 2017-05-14 07:24:56       398.000
##                                 Start.Station
## 1                        14th & Belmont St NW
## 2                  Yuma St & Tenley Circle NW
## 3              17th St & Massachusetts Ave NW
## 4            Constitution Ave & 2nd St NW/DOL
## 5 Henry Bacon Dr & Lincoln Memorial Circle NW
## 6                               1st & K St SE
##                                           End.Station  User.Type
## 1                                      15th & K St NW Subscriber
## 2                        Connecticut Ave & Yuma St NW Subscriber
## 3                                       5th & K St NW Subscriber
## 4                          M St & Pennsylvania Ave NW   Customer
## 5                               Maine Ave & 7th St SW Subscriber
## 6 Eastern Market Metro / Pennsylvania Ave & 7th St SE Subscriber
head(chi)
##         X          Start.Time            End.Time Trip.Duration
## 1 1423854 2017-06-23 15:09:32 2017-06-23 15:14:53           321
## 2  955915 2017-05-25 18:19:03 2017-05-25 18:45:53          1610
## 3    9031 2017-01-04 08:27:49 2017-01-04 08:34:45           416
## 4  304487 2017-03-06 13:49:38 2017-03-06 13:55:28           350
## 5   45207 2017-01-17 14:53:07 2017-01-17 15:02:01           534
## 6 1473887 2017-06-26 09:01:20 2017-06-26 09:11:06           586
##                   Start.Station                  End.Station  User.Type Gender
## 1          Wood St & Hubbard St      Damen Ave & Chicago Ave Subscriber   Male
## 2           Theater on the Lake Sheffield Ave & Waveland Ave Subscriber Female
## 3            May St & Taylor St          Wood St & Taylor St Subscriber   Male
## 4 Christiana Ave & Lawrence Ave St. Louis Ave & Balmoral Ave Subscriber   Male
## 5        Clark St & Randolph St Desplaines St & Jackson Blvd Subscriber   Male
## 6  Clinton St & Washington Blvd         Canal St & Taylor St Subscriber   Male
##   Birth.Year
## 1       1992
## 2       1992
## 3       1981
## 4       1986
## 5       1975
## 6       1990
# Data manipulation 

calculate_age <- function(df, birth_year, start_time) {
  #' Calculates user age by subtracting birth year from start year

  # Converts Start.Time to Date datatype
  df[[start_time]] <- as.Date(df[[start_time]])
  
  # Extracts year from Start.Time
  start_year <- year(df[[start_time]])
  
  # Calculates difference in years
  age_difference <- start_year - df[[birth_year]]

  # Returns age
  return(age_difference)
}

# Creating new column for age and applying function
ny$age <- calculate_age(ny, "Birth.Year", "Start.Time")
chi$age <- calculate_age(chi, "Birth.Year", "Start.Time")

# Creating new column for city name
ny$city <- "New York"
chi$city <- "Chicago"
wash$city <- "Washington"

# Combining ny and chi dataframes into one dataframe
ny_chi <- rbind(ny, chi)

# Removing dissimilar columns compared to wash
ny_trunc <- ny %>%
  select(-one_of("Gender", "Birth.Year", "age"))

chi_trunc <- chi %>%
  select(-one_of("Gender", "Birth.Year", "age"))

# Combining ny, chi, and wash dataframes into one dataframe
all_city <- rbind(ny_trunc, chi_trunc, wash)

How do subscriber ages compare by gender in Chicago versus New York?


subscriber_age <- ny_chi %>%
  # Filters for Subscribers, non-null age, and non-null gender
  filter(User.Type == "Subscriber" & !(is.na(age)) & Gender=="Male" | Gender=="Female") %>%
  group_by(city, Gender, age) %>%
  summarise(subscriber_count = n())

# Finds highest subscriber count for females in New York
max_f_ny <- subscriber_age %>%
  filter(city == "New York", Gender == "Female") %>%
  arrange(desc(subscriber_count)) %>%
  slice(1)

max_f_ny
## # A tibble: 1 × 4
## # Groups:   city, Gender [1]
##   city     Gender   age subscriber_count
##   <chr>    <chr>  <dbl>            <int>
## 1 New York Female    28              574
# Finds highest subscriber count for males in New York
max_m_ny <- subscriber_age %>%
  filter(city == "New York", Gender == "Male") %>%
  arrange(desc(subscriber_count)) %>%
  slice(1)

max_m_ny
## # A tibble: 1 × 4
## # Groups:   city, Gender [1]
##   city     Gender   age subscriber_count
##   <chr>    <chr>  <dbl>            <int>
## 1 New York Male      32             1486
# Finds highest subscriber count for females in Chicago
max_f_chi <- subscriber_age %>%
  filter(city == "Chicago", Gender == "Female") %>%
  arrange(desc(subscriber_count)) %>%
  slice(1)

max_f_chi
## # A tibble: 1 × 4
## # Groups:   city, Gender [1]
##   city    Gender   age subscriber_count
##   <chr>   <chr>  <dbl>            <int>
## 1 Chicago Female    25              114
# Finds highest subscriber count for males in Chicago
max_m_chi <- subscriber_age %>%
  filter(city == "Chicago", Gender == "Male") %>%
  arrange(desc(subscriber_count)) %>%
  slice(1)

max_m_chi
## # A tibble: 1 × 4
## # Groups:   city, Gender [1]
##   city    Gender   age subscriber_count
##   <chr>   <chr>  <dbl>            <int>
## 1 Chicago Male      28              287
# Finds count for females and males in each city
f_m_count <- ny_chi %>%
  filter(User.Type == "Subscriber" & !(is.na(age)) & Gender=="Male" | Gender=="Female") %>%
  group_by(city, Gender) %>%
  summarise(count = n())

f_m_count
## # A tibble: 4 × 3
## # Groups:   city [2]
##   city     Gender count
##   <chr>    <chr>  <int>
## 1 Chicago  Female  1723
## 2 Chicago  Male    5159
## 3 New York Female 12159
## 4 New York Male   36625
graph1 <- ny_chi %>%
  # Filters for Subscribers, non-null age, and non-null gender
  filter(User.Type == "Subscriber" & !(is.na(age)) & Gender=="Male" | Gender=="Female") %>%
  # Creates histogram with labels
  ggplot(aes(x = age)) + 
  geom_histogram(binwidth=5, color="black", fill="lightblue") + 
  labs(title="Distribution of Age by Gender in New York vs Chicago", x="Age", y="Count") + 
  # facets by gender and city
  facet_grid(city~Gender) +
  theme_minimal() + 
  scale_x_continuous(breaks = seq(0, 100, by=10))

# Converts to a plotly graph for tooltip functionality
ggplotly(graph1)

Summary

The histograms display the distribution of age faceted by gender for New York and Chicago. While the counts for Chicago are lower than the counts for New York, it can be seen that for both cities, there are more men than women riders. In New York, there are 36,625 men compared to 12,159 women. In Chicago, there are 5,159 men and 1,723 women. Both cities’ histograms are also skewed right for both men and women. It appears that most riders are between people in their early twenties to mid-thirties, regardless of city. The most frequent age for female subscribers in New York is 28 while it is 32 for male subscribers. The most frequent age for female subscribers in Chicago is 25 while it is 28 for males in Chicago.

How does the average trip duration compare by user type across different cities?


# Creates new column for converting trip duration from seconds to minutes
all_city$Trip.Duration.Mins <- all_city$Trip.Duration / 60

avg_trip <- all_city %>%
  # Filters for non-null user types
  filter(User.Type == "Subscriber" | User.Type == "Customer") %>%
  group_by(city, User.Type) %>%
  # Calculating average trip duration by city and user type
  summarise(Average = mean(Trip.Duration.Mins, na.rm=T))

avg_trip
## # A tibble: 6 × 3
## # Groups:   city [3]
##   city       User.Type  Average
##   <chr>      <chr>        <dbl>
## 1 Chicago    Customer      32.2
## 2 Chicago    Subscriber    11.4
## 3 New York   Customer      36.6
## 4 New York   Subscriber    12.6
## 5 Washington Customer      43.9
## 6 Washington Subscriber    12.2
graph2 <- avg_trip %>%
  # Creates grouped bar chart with labels
  ggplot(aes(x=city, y=Average, fill=User.Type)) + 
  geom_col(position = "dodge") + 
  labs(title="Average Trip Duration by User Type by City", 
       x="City", 
       y="Trip Duration (Minutes)", 
       fill="User Type") +
  theme_minimal() + 
  scale_fill_brewer(palette="Paired")

# Converts to a plotly graph for tooltip functionality
ggplotly(graph2)


Summary

The grouped bar chart shows that across all cities in the data, customers take longer average trips than subscribers. Customers in Washington seem to take the longest trips on average, followed by New York, and then Chicago at 43.9 minutes, 36.6 minutes, and 32.2 minutes, respectively. The subscribers across all cities take similar trip duration on average at 11.4 minutes for Chicago, 12.6 minutes for New York and 12.2 minutes for Washington. This could be due to one-time customers tending to be tourists and subscribers tending to be locals using bike sharing for commuting to work, however this cannot be proven or denied based on the dataset.

How do daily and hourly usage rates compare by city?


New York

extract_hour_day <- function(df, start_time) {
  #' Extracts the hour and day of the week from Start.Time and order the days of the week
  
  # Convert Start.Time to POSIXct datetime format
  df[[start_time]] <- as.POSIXct(df[[start_time]], format = "%Y-%m-%d %H:%M:%S")
  
  # Extract hour and day of the week from Start.Time
  df$hour <- format(df[[start_time]], "%H")
  df$day_of_week <- format(df[[start_time]], "%A")
  
  # Order days of the week
  df$day_of_week <- factor(df$day_of_week, levels=c("Sunday", "Saturday", "Friday", "Thursday", "Wednesday",
                                                    "Tuesday", "Monday"))
  return(df)
}

usage_df <- function(df) {
  #' Filters, groups by day of the week and hour, and finds trip count for the df

  # Filters and summarises dataframe
  usage_detail <- df %>%
    filter(!(is.na(day_of_week)) & !(is.na(hour))) %>%
    group_by(day_of_week, hour) %>%
    summarise(trip_count = n(), .groups = 'drop')
  
  return(usage_detail)
}

plot_heatmap <- function(df) {
  #' Plots the usage detail df in a heatmap
  
   plot <- ggplot(df, aes(x = hour, y = day_of_week, fill = trip_count)) +
    # Creates heatmap with labels
    geom_tile() + 
    scale_fill_gradient(low = "lightblue", high = "darkblue") +
    theme_minimal() +
    labs(title = "Bike Usage by Hour and Day of the Week",
         x = "Hour of the Day",
         y = "Day of the Week",
         fill = "Trip Count")
   
   return(plot)
}

# Extracts hour and day of week from New York dataframe
hour_day_ny <- extract_hour_day(ny, "Start.Time")

# Returns usage dataframe
detail_ny <- usage_df(hour_day_ny)

# Finds top 5 highest usage times
usage_summary <- detail_ny %>%
  arrange(desc(trip_count)) %>%
  slice(c(1, n()))

# Prints top 5 usage times
usage_summary
## # A tibble: 2 × 3
##   day_of_week hour  trip_count
##   <fct>       <chr>      <int>
## 1 Wednesday   17          1069
## 2 Tuesday     03             4
# Returns heatmap for New York dataframe
plot_heatmap(detail_ny)

Chicago


# Extracts hour and day of week from Chicago dataframe
hour_day_chi <- extract_hour_day(chi, "Start.Time")

# Returns usage dataframe
detail_chi <- usage_df(hour_day_chi)

# Finds top 5 highest usage times
usage_summary2 <- detail_chi %>%
  arrange(desc(trip_count)) %>%
  slice(c(1, n()))

# Prints top 5 usage times
usage_summary2
## # A tibble: 2 × 3
##   day_of_week hour  trip_count
##   <fct>       <chr>      <int>
## 1 Thursday    17           208
## 2 Wednesday   02             1
# Returns heatmap for Chicago dataframe
plot_heatmap(detail_chi)

Washington D.C.


# Extracts hour and day of week from Washington dataframe
hour_day_wash <- extract_hour_day(wash, "Start.Time")

# Returns usage dataframe
detail_wash <- usage_df(hour_day_wash)

# Finds top 5 highest usage times
usage_summary3 <- detail_wash %>%
  arrange(desc(trip_count)) %>%
  slice(c(1, n()))

# Prints top 5 usage times
usage_summary3
## # A tibble: 2 × 3
##   day_of_week hour  trip_count
##   <fct>       <chr>      <int>
## 1 Wednesday   08          1962
## 2 Thursday    00            19
# Returns heatmap for Washington dataframe
plot_heatmap(detail_wash)


Summary

The heatmaps show different levels of usage throughout each hour of the day by day of the week for each city. The heatmap for New York shows that the days and times with the heaviest usage are on weekdays between 7am - 9am and 5pm - 7pm. Weekends during the day also seem to be popular in New York. The day and time with the highest usage in New York is Wednesday at 5pm while the lowest usage is Tuesday at 3am.

This pattern is also mirrored pretty similarly in Chicago where the heaviest usage times are weekdays from around 7am - 8am and around 4pm - 6pm. Weekends during the day also seem to be popular. The day and time with the highest usage in Chicago is Thursday at 5pm and the lowest usage is Wednesday at 2 am. The weekday pattern could be due to professionals utilizing bike sharing to commute to and from work. The weekend patterns could be due to riders using the bikes for leisure.

Interestingly, in Washington D.C the heaviest usage times are on weekdays from around 5am - 8am, however this is not mirrored in the evening as expected. For weekends, it seems that the late night/ early morning time is the most popular. The day and time with the highest usage is Wednesday at 8am and the lowest usage is Thursday at 12am.